{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Learning More About Pandas by Building and Using a Weighted Average Function"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook is based on the article on [Pbpython.com](http://pbpython.com/weighted-average.html). Please reference it for the background and additional details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read in our sample sales data that includes projected price for our new product launch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Account</th>\n",
       "      <th>Name</th>\n",
       "      <th>State</th>\n",
       "      <th>Rep</th>\n",
       "      <th>Manager</th>\n",
       "      <th>Current_Price</th>\n",
       "      <th>Quantity</th>\n",
       "      <th>New_Product_Price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>714466</td>\n",
       "      <td>Trantow-Barrows</td>\n",
       "      <td>MN</td>\n",
       "      <td>Craig Booker</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>500</td>\n",
       "      <td>100</td>\n",
       "      <td>550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>737550</td>\n",
       "      <td>Fritsch, Russel and Anderson</td>\n",
       "      <td>MN</td>\n",
       "      <td>Craig Booker</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>600</td>\n",
       "      <td>90</td>\n",
       "      <td>725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>146832</td>\n",
       "      <td>Kiehn-Spinka</td>\n",
       "      <td>TX</td>\n",
       "      <td>Daniel Hilton</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>225</td>\n",
       "      <td>475</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>218895</td>\n",
       "      <td>Kulas Inc</td>\n",
       "      <td>TX</td>\n",
       "      <td>Daniel Hilton</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>290</td>\n",
       "      <td>375</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>412290</td>\n",
       "      <td>Jerde-Hilpert</td>\n",
       "      <td>WI</td>\n",
       "      <td>John Smith</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>375</td>\n",
       "      <td>400</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>740150</td>\n",
       "      <td>Barton LLC</td>\n",
       "      <td>WI</td>\n",
       "      <td>John Smith</td>\n",
       "      <td>Debra Henley</td>\n",
       "      <td>550</td>\n",
       "      <td>100</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>141962</td>\n",
       "      <td>Herman LLC</td>\n",
       "      <td>CA</td>\n",
       "      <td>Cedric Moss</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>400</td>\n",
       "      <td>200</td>\n",
       "      <td>425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>163416</td>\n",
       "      <td>Purdy-Kunde</td>\n",
       "      <td>CA</td>\n",
       "      <td>Cedric Moss</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>450</td>\n",
       "      <td>150</td>\n",
       "      <td>475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>239344</td>\n",
       "      <td>Stokes LLC</td>\n",
       "      <td>WA</td>\n",
       "      <td>Cedric Moss</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>550</td>\n",
       "      <td>75</td>\n",
       "      <td>610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>307599</td>\n",
       "      <td>Kassulke, Ondricka and Metz</td>\n",
       "      <td>NV</td>\n",
       "      <td>Wendy Yule</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>275</td>\n",
       "      <td>450</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>688981</td>\n",
       "      <td>Keeling LLC</td>\n",
       "      <td>NV</td>\n",
       "      <td>Wendy Yule</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>300</td>\n",
       "      <td>250</td>\n",
       "      <td>350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>729833</td>\n",
       "      <td>Koepp Ltd</td>\n",
       "      <td>NV</td>\n",
       "      <td>Wendy Yule</td>\n",
       "      <td>Fred Anderson</td>\n",
       "      <td>350</td>\n",
       "      <td>100</td>\n",
       "      <td>375</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Account                          Name State            Rep        Manager  \\\n",
       "0    714466               Trantow-Barrows    MN   Craig Booker   Debra Henley   \n",
       "1    737550  Fritsch, Russel and Anderson    MN   Craig Booker   Debra Henley   \n",
       "2    146832                  Kiehn-Spinka    TX  Daniel Hilton   Debra Henley   \n",
       "3    218895                     Kulas Inc    TX  Daniel Hilton   Debra Henley   \n",
       "4    412290                 Jerde-Hilpert    WI     John Smith   Debra Henley   \n",
       "5    740150                    Barton LLC    WI     John Smith   Debra Henley   \n",
       "6    141962                    Herman LLC    CA    Cedric Moss  Fred Anderson   \n",
       "7    163416                   Purdy-Kunde    CA    Cedric Moss  Fred Anderson   \n",
       "8    239344                    Stokes LLC    WA    Cedric Moss  Fred Anderson   \n",
       "9    307599   Kassulke, Ondricka and Metz    NV     Wendy Yule  Fred Anderson   \n",
       "10   688981                   Keeling LLC    NV     Wendy Yule  Fred Anderson   \n",
       "11   729833                     Koepp Ltd    NV     Wendy Yule  Fred Anderson   \n",
       "\n",
       "    Current_Price  Quantity  New_Product_Price  \n",
       "0             500       100                550  \n",
       "1             600        90                725  \n",
       "2             225       475                255  \n",
       "3             290       375                300  \n",
       "4             375       400                400  \n",
       "5             550       100                600  \n",
       "6             400       200                425  \n",
       "7             450       150                475  \n",
       "8             550        75                610  \n",
       "9             275       450                300  \n",
       "10            300       250                350  \n",
       "11            350       100                375  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales = pd.read_excel(\"https://github.com/chris1610/pbpython/blob/master/data/sales-estimate.xlsx?raw=True\", sheetname=\"projections\")\n",
    "sales"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Show the mean for our current and new product price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "405.416666667\n",
      "447.083333333\n"
     ]
    }
   ],
   "source": [
    "print(sales[\"Current_Price\"].mean())\n",
    "print(sales[\"New_Product_Price\"].mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the weighted average using the long form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "342.540687161\n",
      "374.638336347\n"
     ]
    }
   ],
   "source": [
    "print((sales[\"Current_Price\"] * sales[\"Quantity\"]).sum() / sales[\"Quantity\"].sum())\n",
    "print((sales[\"New_Product_Price\"] * sales[\"Quantity\"]).sum() / sales[\"Quantity\"].sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use np.average to simplify the formula"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "342.540687161\n",
      "374.638336347\n"
     ]
    }
   ],
   "source": [
    "print(np.average(sales[\"Current_Price\"], weights=sales[\"Quantity\"]))\n",
    "print(np.average(sales[\"New_Product_Price\"], weights=sales[\"Quantity\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For maximum flexibility, build our own weighted average function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def wavg(group, avg_name, weight_name):\n",
    "    \"\"\" http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns\n",
    "    In rare instance, we may not have weights, so just return the mean\n",
    "    \"\"\"\n",
    "    d = group[avg_name]\n",
    "    w = group[weight_name]\n",
    "    try:\n",
    "        return (d * w).sum() / w.sum()\n",
    "    except ZeroDivisionError:\n",
    "        return d.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Call the weighted average on all of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "342.540687161\n",
      "374.638336347\n"
     ]
    }
   ],
   "source": [
    "print(wavg(sales, \"Current_Price\", \"Quantity\"))\n",
    "print(wavg(sales, \"New_Product_Price\", \"Quantity\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group the data by manager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Manager\n",
       "Debra Henley     340.665584\n",
       "Fred Anderson    344.897959\n",
       "dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales.groupby(\"Manager\").apply(wavg, \"Current_Price\", \"Quantity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Manager\n",
       "Debra Henley     372.646104\n",
       "Fred Anderson    377.142857\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales.groupby(\"Manager\").apply(wavg, \"New_Product_Price\", \"Quantity\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also group by state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "State\n",
       "CA    446.428571\n",
       "MN    632.894737\n",
       "NV    325.000000\n",
       "TX    274.852941\n",
       "WA    610.000000\n",
       "WI    440.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales.groupby(\"State\").apply(wavg, \"New_Product_Price\", \"Quantity\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also group by multiple criteria and the function will work correctly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Manager        State\n",
       "Debra Henley   MN       632.894737\n",
       "               TX       274.852941\n",
       "               WI       440.000000\n",
       "Fred Anderson  CA       446.428571\n",
       "               NV       325.000000\n",
       "               WA       610.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales.groupby([\"Manager\", \"State\"]).apply(wavg, \"New_Product_Price\", \"Quantity\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example of applying multiple aggregation functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">Quantity</th>\n",
       "      <th>New_Product_Price</th>\n",
       "      <th>Current_Price</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "      <th>mean</th>\n",
       "      <th>median</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Manager</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Debra Henley</th>\n",
       "      <td>1540</td>\n",
       "      <td>256.666667</td>\n",
       "      <td>471.666667</td>\n",
       "      <td>437.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fred Anderson</th>\n",
       "      <td>1225</td>\n",
       "      <td>204.166667</td>\n",
       "      <td>422.500000</td>\n",
       "      <td>375.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              Quantity             New_Product_Price Current_Price\n",
       "                   sum        mean              mean        median\n",
       "Manager                                                           \n",
       "Debra Henley      1540  256.666667        471.666667         437.5\n",
       "Fred Anderson     1225  204.166667        422.500000         375.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f = {'New_Product_Price': ['mean'],'Current_Price': ['median'], 'Quantity': ['sum', 'mean']}\n",
    "sales.groupby(\"Manager\").agg(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similar method to group multiple custom functions together into a single DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_1 = sales.groupby(\"Manager\").apply(wavg, \"New_Product_Price\", \"Quantity\")\n",
    "data_2 = sales.groupby(\"Manager\").apply(wavg, \"Current_Price\", \"Quantity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>New Product Price</th>\n",
       "      <th>Current Product Price</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Manager</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Debra Henley</th>\n",
       "      <td>372.646104</td>\n",
       "      <td>340.665584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fred Anderson</th>\n",
       "      <td>377.142857</td>\n",
       "      <td>344.897959</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               New Product Price  Current Product Price\n",
       "Manager                                                \n",
       "Debra Henley          372.646104             340.665584\n",
       "Fred Anderson         377.142857             344.897959"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary = pd.DataFrame(data=dict(s1=data_1, s2=data_2))\n",
    "summary.columns = [\"New Product Price\",\"Current Product Price\"]\n",
    "summary.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, numpy has an average function that can be used:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "342.54068716094031"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.average(sales[\"Current_Price\"], weights=sales[\"Quantity\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use a lambda function for it to work with grouped data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Manager\n",
       "Debra Henley     372.646104\n",
       "Fred Anderson    377.142857\n",
       "dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sales.groupby(\"Manager\").apply(lambda x: np.average(x['New_Product_Price'], weights=x['Quantity']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
